{
"cells": [
{
"cell_type": "markdown",
"source": [
"# Automation Setup - Configure Azure Machine Learning Pipelines\r\n",
"\r\n",
"__Notebook Version:__ 1.0<br>\r\n",
"__Python Version:__ Python 3.8 - AzureML<br>\r\n",
"__Required Packages:__ No<br>\r\n",
"__Platforms Supported:__ Azure Machine Learning Notebooks\r\n",
" \r\n",
"__Data Source Required:__ No \r\n",
" \r\n",
"### Description\r\n",
"This is the second notebook of series for setting up Microsoft Sentinel notebook automation platform based on Azure Machine Learning Pipelines.</br>\r\n",
"Before starting this notebook, you should have a notebook to be executed automatically ready. </br>\r\n",
"This notebook provides step-by-step instructions to create Azure Machine Learning Pipeline, publish it, and schedule to run the pipeline to execute the targeted notebook.</br>\r\n",
"\r\n",
"*** Please run the cells sequentially to avoid errors. Please do not use \"run all cells\". *** <br>\r\n",
"\r\n",
"## Table of Contents\r\n",
"1. Warm-up\r\n",
"2. Authentication to Azure Resources\r\n",
"3. Azure Machine Learning Pipleline"
],
"metadata": {
"nteract": {
"transient": {
"deleting": false
}
}
}
},
{
"cell_type": "markdown",
"source": [
"## 1. Warm-up"
],
"metadata": {
"nteract": {
"transient": {
"deleting": false
}
}
}
},
{
"cell_type": "code",
"source": [
"# Azure Machine Learning and Pipeline SDK-specific imports\r\n",
"# azureml\r\n",
"import azureml.core\r\n",
"from azureml.core import Workspace, Experiment\r\n",
"from azureml.core.datastore import Datastore\r\n",
"from azureml.core.runconfig import RunConfiguration\r\n",
"from azureml.core.conda_dependencies import CondaDependencies\r\n",
"from azureml.contrib.notebook import NotebookRunConfig, AzureMLNotebookHandler\r\n",
"from azureml.pipeline.core import Pipeline\r\n",
"from azureml.pipeline.core import PipelineData\r\n",
"from azureml.contrib.notebook import NotebookRunnerStep\r\n",
"from azureml.pipeline.core.schedule import ScheduleRecurrence, Schedule\r\n",
"\r\n",
"# azure common/core\r\n",
"from azure.common.credentials import get_azure_cli_credentials\r\n",
"from azure.mgmt.resource import ResourceManagementClient\r\n",
"\r\n",
"# Python/ipython\r\n",
"import json\r\n",
"from datetime import datetime\r\n",
"from IPython.display import display, HTML, Markdown\r\n",
"\r\n",
"# Check core SDK version number\r\n",
"print(\"SDK version:\", azureml.core.VERSION)"
],
"outputs": [],
"execution_count": null,
"metadata": {
"gather": {
"logged": 1642211715022
}
}
},
{
"cell_type": "code",
"source": [
"# Functions will be used in this notebook\r\n",
"def read_config_values(file_path):\r\n",
" \"This loads pre-generated parameters for Microsoft Sentinel Workspace\"\r\n",
" with open(file_path) as json_file:\r\n",
" if json_file:\r\n",
" json_config = json.load(json_file)\r\n",
" return (json_config[\"tenant_id\"],\r\n",
" json_config[\"subscription_id\"],\r\n",
" json_config[\"resource_group\"],\r\n",
" json_config[\"workspace_id\"],\r\n",
" json_config[\"workspace_name\"],\r\n",
" json_config[\"user_alias\"],\r\n",
" json_config[\"user_object_id\"])\r\n",
" return None\r\n",
"\r\n",
"def has_valid_token():\r\n",
" \"Check to see if there is a valid AAD token\"\r\n",
" try:\r\n",
" credentials, sub_id = get_azure_cli_credentials()\r\n",
" creds = credentials._get_cred(resource=None)\r\n",
" token = creds._token_retriever()[2]\r\n",
" print(\"Successfully signed in.\")\r\n",
" return True\r\n",
" except Exception as ex:\r\n",
" if \"Please run 'az login' to setup account\" in str(ex):\r\n",
" print(\"Please sign in first.\")\r\n",
" return False\r\n",
" elif \"AADSTS70043: The refresh token has expired\" in str(ex):\r\n",
" message = \"**The refresh token has expired. <br> Please continue your login process. Then: <br> 1. If you plan to run multiple notebooks on the same compute instance today, you may restart the compute instance by clicking 'Compute' on left menu, then select the instance, clicking 'Restart'; <br> 2. Otherwise, you may just restart the kernel from top menu. <br> Finally, close and re-load the notebook, then re-run cells one by one from the top.**\"\r\n",
" display(Markdown(message))\r\n",
" return False\r\n",
" elif \"[Errno 2] No such file or directory: '/home/azureuser/.azure/azureProfile.json'\" in str(ex):\r\n",
" print(\"Please sign in.\")\r\n",
" return False\r\n",
" else:\r\n",
" print(str(ex))\r\n",
" return False\r\n",
" except:\r\n",
" print(\"Please restart the kernel, and run 'az login'.\")\r\n",
" return False"
],
"outputs": [],
"execution_count": null,
"metadata": {
"jupyter": {
"source_hidden": false,
"outputs_hidden": false
},
"nteract": {
"transient": {
"deleting": false
}
},
"gather": {
"logged": 1642211717929
}
}
},
{
"cell_type": "code",
"source": [
"# Calling the above function to populate Microsoft Sentinel workspace parameters\r\n",
"# The file, config.json, was generated by the system, however, you may modify the values, or manually set the variables\r\n",
"tenant_id, subscription_id, resource_group, workspace_id, workspace_name, user_alias, user_object_id = read_config_values('config.json');\r\n",
"print(\"Subscription Id: \" + subscription_id)"
],
"outputs": [],
"execution_count": null,
"metadata": {
"jupyter": {
"source_hidden": false,
"outputs_hidden": false
},
"nteract": {
"transient": {
"deleting": false
}
},
"gather": {
"logged": 1642211719915
}
}
},
{
"cell_type": "markdown",
"source": [
"## 2. Authentication to Azure Resources"
],
"metadata": {
"nteract": {
"transient": {
"deleting": false
}
}
}
},
{
"cell_type": "code",
"source": [
"# Azure CLI is used to get device code to login into Azure, you need to copy the code and open the DeviceLogin site.\r\n",
"# You may add [--tenant $tenant_id] to the command\r\n",
"if has_valid_token() == False:\r\n",
" !echo -e '\\e[42m'\r\n",
" !az login --tenant $tenant_id --use-device-code"
],
"outputs": [],
"execution_count": null,
"metadata": {
"jupyter": {
"source_hidden": false,
"outputs_hidden": false
},
"nteract": {
"transient": {
"deleting": false
}
},
"gather": {
"logged": 1642211721898
}
}
},
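{
"cell_type": "code",
"source": [
"# Optional: confirm that the Azure CLI is now signed in and using the expected subscription.\r\n",
"# This is a read-only check (a minimal sketch); if a different subscription is selected,\r\n",
"# you may run 'az account set --subscription <your subscription id>' to switch.\r\n",
"!az account show --output table"
],
"outputs": [],
"execution_count": null,
"metadata": {}
},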
{
"cell_type": "markdown",
"source": [
"## 3. Azure Machine Learning Pipleline"
],
"metadata": {
"nteract": {
"transient": {
"deleting": false
}
}
}
},
{
"cell_type": "code",
"source": [
"# 1. Enter resource names\r\n",
"# Enter name of an Azure resource group\r\n",
"resource_group = 'myresourcegroup'\r\n",
"# Enter current AML workspace name\r\n",
"current_aml_workspace_name = 'auto2022'\r\n",
"# Enter compute cluster name\r\n",
"amlcompute_cluster_name = 'compcl2022'\r\n"
],
"outputs": [],
"execution_count": null,
"metadata": {
"jupyter": {
"source_hidden": false,
"outputs_hidden": false
},
"nteract": {
"transient": {
"deleting": false
}
},
"gather": {
"logged": 1642211723878
}
}
},
{
"cell_type": "code",
"source": [
"# 2. Get AML workspace\r\n",
"ws = Workspace.get(name=current_aml_workspace_name, subscription_id=subscription_id, resource_group=resource_group)\r\n",
"print(ws)\r\n",
"ws.set_default_datastore(\"workspaceblobstore\")\r\n",
"datastore = Datastore.get(ws, \"workspaceblobstore\")"
],
"outputs": [],
"execution_count": null,
"metadata": {
"jupyter": {
"source_hidden": false,
"outputs_hidden": false
},
"nteract": {
"transient": {
"deleting": false
}
},
"gather": {
"logged": 1642211729390
}
}
},
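{
"cell_type": "code",
"source": [
"# Optional: verify that the compute cluster named above exists, and create it if it does not.\r\n",
"# This is a minimal sketch; the vm_size and max_nodes values below are placeholders to adjust for your workload.\r\n",
"from azureml.core.compute import ComputeTarget, AmlCompute\r\n",
"from azureml.core.compute_target import ComputeTargetException\r\n",
"\r\n",
"try:\r\n",
"    compute_target = ComputeTarget(workspace=ws, name=amlcompute_cluster_name)\r\n",
"    print(\"Found existing compute cluster:\", amlcompute_cluster_name)\r\n",
"except ComputeTargetException:\r\n",
"    compute_config = AmlCompute.provisioning_configuration(vm_size=\"STANDARD_DS3_V2\", max_nodes=1)\r\n",
"    compute_target = ComputeTarget.create(ws, amlcompute_cluster_name, compute_config)\r\n",
"    compute_target.wait_for_completion(show_output=True)"
],
"outputs": [],
"execution_count": null,
"metadata": {}
},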
{
"cell_type": "code",
"source": [
"# 3. Create a new RunConfig object\r\n",
"source_directory = ''\r\n",
"notebook_name = 'Automation Gallery - Credential Scan on Azure Blob Storage.ipynb'\r\n",
"output_notebook_name = 'blob_scan_results.ipynb'\r\n",
"conda_run_config = RunConfiguration(framework=\"python\")\r\n",
"conda_run_config.environment.docker.base_image = azureml.core.runconfig.DEFAULT_CPU_IMAGE\r\n",
"print('conda-run config is ready')\r\n",
"\r\n",
"# Create notebook run configuration and set parameters values\r\n",
"handler = AzureMLNotebookHandler(timeout=600, progress_bar=False, log_output=True)\r\n",
"cfg = NotebookRunConfig(source_directory=source_directory, notebook=notebook_name,\r\n",
" handler = handler,\r\n",
" parameters={},\r\n",
" run_config=conda_run_config,\r\n",
" output_notebook=output_notebook_name)\r\n",
"\r\n",
"print(\"Notebook Run Config is created.\")"
],
"outputs": [],
"execution_count": null,
"metadata": {
"jupyter": {
"source_hidden": false,
"outputs_hidden": false
},
"nteract": {
"transient": {
"deleting": false
}
},
"gather": {
"logged": 1642212053102
}
}
},
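{
"cell_type": "code",
"source": [
"# Optional: if the target notebook needs extra Python packages on the compute cluster, you may\r\n",
"# attach them to the run configuration via CondaDependencies (imported in the warm-up cell).\r\n",
"# A minimal sketch; the pip packages listed below are examples - replace them with your own.\r\n",
"extra_dependencies = CondaDependencies.create(pip_packages=[\"msticpy\", \"pandas\"])\r\n",
"conda_run_config.environment.python.conda_dependencies = extra_dependencies\r\n",
"print(\"Conda dependencies attached to the run configuration.\")"
],
"outputs": [],
"execution_count": null,
"metadata": {}
},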
{
"cell_type": "code",
"source": [
"# 4. Define NotebookRunnerStep\r\n",
"#my_pipeline_param = PipelineParameter(name=\"my_pipeline_param\", default_value=datetime.now().strftime(\"%Y-%m-%d %H:%M:%S\"))\r\n",
"output_name = \"notebookresult_2022\"\r\n",
"\r\n",
"output_from_notebook = PipelineData(name=\"notebook_processed_data\", datastore=Datastore.get(ws, \"workspaceblobstore\"),output_overwrite=True, output_mode=\"upload\")\r\n",
"notebook_runner_step = NotebookRunnerStep(name=\"sentinel_notebook_step\",\r\n",
" notebook_run_config=cfg,\r\n",
" params = {},\r\n",
" # params={\"my_pipeline_param\": my_pipeline_param},\r\n",
" inputs=[],\r\n",
" outputs=[], \r\n",
" allow_reuse=False,\r\n",
" compute_target=amlcompute_cluster_name,\r\n",
" output_notebook_pipeline_data_name=output_name)\r\n",
"\r\n",
"print(\"Notebook Runner Step is Created.\")"
],
"outputs": [],
"execution_count": null,
"metadata": {
"jupyter": {
"source_hidden": false,
"outputs_hidden": false
},
"nteract": {
"transient": {
"deleting": false
}
},
"gather": {
"logged": 1642212057899
}
}
},
{
"cell_type": "code",
"source": [
"# 5. Build Pipeline and publish it\r\n",
"pipeline4sentinel = Pipeline(workspace=ws, steps=[notebook_runner_step])\r\n",
"print(\"Pipeline creation complete\")\r\n",
"\r\n",
"# Publish the pipeline\r\n",
"timenow = datetime.now().strftime('%Y-%m-%d-%H-%M')\r\n",
"pipeline_name = \"Sentinel-Pipeline-\" + timenow\r\n",
"\r\n",
"published_sentinel_pipeline = pipeline4sentinel.publish(\r\n",
" name=pipeline_name, \r\n",
" description=pipeline_name)\r\n",
"print(\"Newly published pipeline id: {}\".format(published_sentinel_pipeline.id))\r\n",
"print(\"Endpoint: {}\".format(published_sentinel_pipeline.endpoint))"
],
"outputs": [],
"execution_count": null,
"metadata": {
"jupyter": {
"source_hidden": false,
"outputs_hidden": false
},
"nteract": {
"transient": {
"deleting": false
}
},
"gather": {
"logged": 1642212067884
}
}
},
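{
"cell_type": "code",
"source": [
"# Optional: submit the published pipeline once to verify that it runs end to end before scheduling it.\r\n",
"# A minimal sketch; 'sentinel_pipeline_test' is an example experiment name.\r\n",
"test_experiment = Experiment(ws, \"sentinel_pipeline_test\")\r\n",
"test_pipeline_run = test_experiment.submit(published_sentinel_pipeline)\r\n",
"print(\"Submitted test run: {}\".format(test_pipeline_run.id))\r\n",
"# test_pipeline_run.wait_for_completion(show_output=True)  # uncomment to block until the run finishes"
],
"outputs": [],
"execution_count": null,
"metadata": {}
},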
{
"cell_type": "code",
"source": [
"# 6. Create a schedule for the published pipeline using a recurrence\r\n",
"schedule_name = 'sentinel_schedule'\r\n",
"experiment_name = 'sentinel_experiment_2022'\r\n",
"recurrence = ScheduleRecurrence(frequency=\"Day\", interval=1, hours=[22], minutes=[30]) # Runs every other day at 10:30pm\r\n",
"#recurrence = ScheduleRecurrence(frequency=\"Hour\", interval=8) # Runs every two hours \r\n",
"\r\n",
"schedule = Schedule.create(workspace=ws, name=schedule_name,\r\n",
" pipeline_id=published_sentinel_pipeline.id, \r\n",
" experiment_name=experiment_name,\r\n",
" recurrence=recurrence,\r\n",
" wait_for_provisioning=True,\r\n",
" description=\"Schedule to run Sentinel notebook\")\r\n",
"\r\n",
"print(\"Created schedule with id: {}\".format(schedule.id))"
],
"outputs": [],
"execution_count": null,
"metadata": {
"jupyter": {
"source_hidden": false,
"outputs_hidden": false
},
"nteract": {
"transient": {
"deleting": false
}
},
"gather": {
"logged": 1642212083888
}
}
}
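,
{
"cell_type": "code",
"source": [
"# Optional: list the schedules in this workspace, and disable one when it is no longer needed.\r\n",
"# A minimal sketch; the disable call is commented out so this cell is safe to run as-is.\r\n",
"for existing_schedule in Schedule.list(ws):\r\n",
"    print(existing_schedule.id, existing_schedule.name, existing_schedule.status)\r\n",
"# schedule.disable(wait_for_provisioning=True)  # stops future runs of the schedule created above"
],
"outputs": [],
"execution_count": null,
"metadata": {}
}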
],
"metadata": {
"kernelspec": {
"name": "python38-azureml",
"language": "python",
"display_name": "Python 3.8 - AzureML"
},
"language_info": {
"name": "python",
"version": "3.8.1",
"mimetype": "text/x-python",
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"pygments_lexer": "ipython3",
"nbconvert_exporter": "python",
"file_extension": ".py"
},
"kernel_info": {
"name": "python38-azureml"
},
"microsoft": {
"host": {
"AzureML": {
"notebookHasBeenCompleted": true
}
}
},
"nteract": {
"version": "nteract-front-end@1.0.0"
}
},
"nbformat": 4,
"nbformat_minor": 0
}